import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Load the training set; every later step in this script works on `data`.
data = pd.read_csv('Training Data.csv')
data.head()  # preview the first 5 rows (the default for DataFrame.head)
data.info()
data.describe()
# Class balance of the target column and overall (rows, columns) shape.
print(data.groupby('Adherence').size())
print(data.shape)
# Missing-value check. `.count()` on the boolean mask only returns the number
# of rows per column, whatever the mask contains; `.sum()` counts the True
# cells, i.e. the missing values per column.
data.isnull().sum()
data.isna().sum()  # isna is an alias of isnull; kept as a second check
#!pip install plotly
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
# Bar chart of the share of each Adherence class, rounded to two decimals.
temp = data.Adherence.value_counts()
share = np.round(temp.astype(float) / temp.values.sum(), 2)
trace = go.Bar(
    x=temp.index,
    y=share,
    text=share,  # print the share on each bar
    textposition='auto',
    name='Adherence',
)
data1 = [trace]
layout = go.Layout(
    autosize=False,
    width=600,
    height=400,
    title="Adherence Distribution",
)
fig = go.Figure(data=data1, layout=layout)
iplot(fig)
# Drop the scratch objects so they do not linger in the namespace.
del temp, share
# Univariate view: print the class counts and draw a count plot for each
# categorical column.
# sns.catplot is the current name of the deprecated sns.factorplot, and the
# column is passed as x= because recent seaborn versions no longer accept it
# positionally.
print(data.groupby('Gender').size())
print('-------------------------------------------------')
sns.catplot(x='Gender', data=data, kind='count')
print(data.groupby('Diabetes').size())
print('--------------------------------------------')
sns.catplot(x='Diabetes', data=data, kind='count')
print(data.groupby('Alcoholism').size())
print('---------------------------------------------')
sns.catplot(x='Alcoholism', data=data, kind='count')
print(data.groupby('HyperTension').size())
sns.catplot(x='HyperTension', data=data, kind='count')
print(data.groupby('Smokes').size())
sns.catplot(x='Smokes', data=data, kind='count')
print(data.groupby('Tuberculosis').size())
sns.catplot(x='Tuberculosis', data=data, kind='count')
# Count rows for every (Gender, Adherence) pair and flatten to a 3-column table.
Gender_Adherence = data.groupby(['Gender','Adherence']).size().to_frame()
Gender_Adherence = Gender_Adherence.reset_index()
Gender_Adherence.columns = ['Gender','Adherence','Count']
Gender_Adherence
# One bar series per Adherence value, so plotly draws them side by side
# for each gender.
trace1 = go.Bar(x = Gender_Adherence.Gender[Gender_Adherence.Adherence=='Yes'],
                y = Gender_Adherence.Count[Gender_Adherence.Adherence=='Yes'],
                text = Gender_Adherence.Count[Gender_Adherence.Adherence=='Yes'],
                textposition = 'auto',
                name = 'Yes')
# The labels of the 'No' bars must come from the 'No' rows; taking them from
# the 'Yes' rows would print the wrong counts on these bars.
trace2 = go.Bar(x = Gender_Adherence.Gender[Gender_Adherence.Adherence=='No'],
                y = Gender_Adherence.Count[Gender_Adherence.Adherence=='No'],
                text = Gender_Adherence.Count[Gender_Adherence.Adherence=='No'],
                textposition = 'auto',
                name = 'No')
tempdata = [trace1,trace2]
layout = go.Layout(width = 800,
                   height = 600,title = 'Gender and Adherence')
fig = go.Figure(data=tempdata, layout=layout)
iplot(fig)
# Bivariate view: for each condition column, print the counts per
# (column, Adherence) pair and draw a count plot split by Adherence.
# All plots use sns.catplot (the current name of the deprecated
# sns.factorplot) with the column passed as x=, because recent seaborn
# versions no longer accept it positionally.
print(data.groupby(['Tuberculosis','Adherence']).size())
sns.catplot(x='Tuberculosis', data=data, hue='Adherence', kind='count')
print(data.groupby(['Smokes','Adherence']).size())
sns.catplot(x='Smokes', data=data, hue='Adherence', kind='count')
print(data.groupby(['HyperTension','Adherence']).size())
sns.catplot(x='HyperTension', data=data, hue='Adherence', kind='count')
print(data.groupby(['Alcoholism','Adherence']).size())
sns.catplot(x='Alcoholism', data=data, hue='Adherence', kind='count')
print(data.groupby(['Diabetes','Adherence']).size())
sns.catplot(x='Diabetes', data=data, hue='Adherence', kind='count')
# Binary-encode the two string columns: 'Yes' -> 1 and 'M' -> 1, every other
# value -> 0.
data['Adherence'] = data['Adherence'].eq('Yes').astype('int64')
data['Gender'] = data['Gender'].eq('M').astype('int64')
# Move patient_id into the index so it is no longer a regular column of the
# frame.
data = data.set_index('patient_id')
data.head()
# Re-check the class counts after encoding.
data.groupby(['Diabetes', 'Adherence']).size()
data.groupby(['Diabetes', 'Alcoholism', 'Adherence']).size()
data.groupby('Adherence').size()
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the split.
# The features are every column except the Adherence target.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns='Adherence'),
    data.Adherence,
    test_size=0.20,
    random_state=101,
)
X_train.head()
X_test.head()
Y_train.head()
Y_test.head()
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings.
logClassifier = LogisticRegression()
predictions = logClassifier.fit(X_train, Y_train).predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
# Report on the held-out split: heading first, then the metric on the next line.
print('##### Classification Report of Logistic Regression #####',
      classification_report(Y_test, predictions),
      sep='\n')
print('##### Confusion Matrix of Logistic Regression #####',
      confusion_matrix(Y_test, predictions),
      sep='\n')
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k = 3, evaluated on the held-out split.
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_pred = knn_3.fit(X_train, Y_train).predict(X_test)
# end='\n\n' leaves a blank line between each heading and its metric.
print('##### Classification Report of KNN Classfier #####', end='\n\n')
print(classification_report(Y_test, knn_pred))
print('##### Confusion Matrix of KNN Classfier #####', end='\n\n')
print(confusion_matrix(Y_test, knn_pred))
from sklearn.tree import DecisionTreeClassifier
# Decision tree, evaluated on the held-out split.
# random_state is fixed so the fitted tree and the report below are
# reproducible between runs, in line with the seeded train/test split and
# gradient-boosting model in this script.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, Y_train)
dt_pred = dt.predict(X_test)
print('##### Classification Report of Decision Tree Classfier #####')
print()
print(classification_report(Y_test,dt_pred))
print('##### Confusion Matrix of Decision Tree Classfier #####')
print()
print(confusion_matrix(Y_test,dt_pred))
from sklearn.ensemble import RandomForestClassifier
# Random forest, evaluated on the held-out split.
# random_state is fixed so the bootstrap samples and feature draws, and
# therefore the report below, are reproducible between runs, in line with
# the seeded train/test split and gradient-boosting model in this script.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, Y_train)
rf_pred = rf.predict(X_test)
print('##### Classification Report of Random Forest Classfier #####')
print()
print(classification_report(Y_test,rf_pred))
print('##### Confusion Matrix of Random Forest Classfier #####')
print()
print(confusion_matrix(Y_test,rf_pred))
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 depth-1 decision trees, evaluated on the held-out split.
# The base learner stays positional because its keyword name differs between
# scikit-learn releases (base_estimator vs. estimator).
# random_state is fixed so the report below is reproducible between runs,
# in line with the seeded train/test split and gradient-boosting model in
# this script.
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    random_state=0,
)
classifier.fit(X_train, Y_train)
adaboost_pred = classifier.predict(X_test)
print('##### Classification Report of Adaboost Classfier #####')
print()
print(classification_report(Y_test,adaboost_pred))
print('##### Confusion Matrix of Adaboost Classfier #####')
print()
print(confusion_matrix(Y_test,adaboost_pred))
from sklearn.ensemble import GradientBoostingClassifier
# Sweep the gradient-boosting learning rate: fit one small model per value
# and print train/test accuracy, the classification report and the confusion
# matrix for each.
# The loop body is indented so that every fit/report runs once per learning
# rate; without the indentation the `for` statement does not parse.
# NOTE(review): the separator print is taken to close each iteration's
# report - confirm against the original notebook cell.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train, Y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, Y_train)))
    print("Accuracy score (testing): {0:.3f}".format(gb.score(X_test, Y_test)))
    gb_pred = gb.predict(X_test)
    print('Classification Report')
    print(classification_report(Y_test,gb_pred))
    print('Confusion Matrix')
    print(confusion_matrix(Y_test,gb_pred))
    print('------------------------------------------------------')
# Fit the gradient-boosting model at a single learning rate (0.75) and report
# it in full. `gb` from this step is the model used to score the test file.
learning_rate = 0.75
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=learning_rate,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, Y_train)
print("Learning rate: ", learning_rate)
print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, Y_train)))
print("Accuracy score (testing): {0:.3f}".format(gb.score(X_test, Y_test)))
gb_pred = gb.predict(X_test)
print('##### Classification Report #####',
      classification_report(Y_test, gb_pred),
      sep='\n')
print('##### Confusion Matrix #####',
      confusion_matrix(Y_test, gb_pred),
      sep='\n')
# ROC curve and area under it, computed from the model's decision_function
# scores on the held-out split rather than from the hard 0/1 predictions.
y_scores_gb = gb.decision_function(X_test)
fpr_gb, tpr_gb, _ = roc_curve(Y_test, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)
print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))
# Score the separate test file with the fitted gradient-boosting model `gb`
# and write one row per patient to result.csv.
test_data = pd.read_csv('Test Data.csv')
# Same Gender encoding as the training data: 'M' -> 1, anything else -> 0.
test_data['Gender'] = test_data['Gender'].eq('M').astype('int64')
test_data = test_data.set_index('patient_id')
test_data.head()
predictions_test = gb.predict(test_data)
predictedProbailityScoresForEachClass = gb.predict_proba(test_data)
prob = pd.DataFrame(predictedProbailityScoresForEachClass)
prob.head()
# Final table, indexed by patient_id:
#   adherence  - predicted label as 'Yes' (1) or 'No' (anything else)
#   prob_score - the larger of the two class-probability columns, i.e. the
#                probability of whichever class is more likely
result = pd.DataFrame(
    {
        'adherence': np.where(predictions_test == 1, 'Yes', 'No'),
        'prob_score': np.maximum(
            predictedProbailityScoresForEachClass[:, 1],
            predictedProbailityScoresForEachClass[:, 0],
        ),
    },
    index=test_data.index,
)
result.head()
result.to_csv('result.csv')